IP_ADDRESSES=$(hostname -I | tr ' ' '\n' | grep -v '^127\.0\.0\.1$' | grep -v '^172\.')
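
# A minimal usage sketch (assumption: the first surviving address is the
# host's serving IP; IP_ADDRESSES is not consumed elsewhere in this script):
# LOCAL_IP=$(echo "$IP_ADDRESSES" | head -n 1)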

export PYTHONPATH=/usr/local/Ascend:$PYTHONPATH
export VLLM_LLMDATADIST_ZMQ_PORT=5570
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Adjust the ranktable files below to match the actual machine IPs and NPU card counts.
export RANK_TABLE_PATH=/data/lumin/omni_infer/tools/scripts/global_path
export GLOBAL_RANK_TABLE_FILE_PATH=/data/lumin/omni_infer/tools/scripts/global_path/global_ranktable_merge.json
export RANK_TABLE_FILE_PATH=/data/lumin/omni_infer/tools/scripts/prefill-ranktable/local_ranktable_7.150.13.139_01234567.json
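
# Optional pre-flight check (a sketch, not part of the original flow): fail
# fast if the ranktable files configured above are missing.
# for f in "$GLOBAL_RANK_TABLE_FILE_PATH" "$RANK_TABLE_FILE_PATH"; do
#     [ -f "$f" ] || { echo "missing ranktable: $f" >&2; exit 1; }
# done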

export LOCAL_DECODE_SERVER_IP_LIST="7.150.13.139"
export GLOBAL_DECODE_SERVER_IP_LIST="7.150.13.139"
export ROLE=prefill
# export MODEL_EXTRA_CFG_PATH="/home/omni_infer/tests/st/deepseek_v3/test_prefill.json"

export HCCL_INTRA_ROCE_ENABLE=1
export HCCL_INTRA_PCIE_ENABLE=0

# For accuracy debugging: force deterministic computation.
export HCCL_DETERMINISTIC=true
export CLOSE_MATMUL_K_SHIFT=1

# For debugging: launch operators synchronously.
# export ASCEND_LAUNCH_BLOCKING=1

# For a 1P1D setup (one prefill pod, one decode pod), configure as follows;
# see the decode-side sketch after the launch command below.
export PREFILL_POD_NUM=1
export DECODE_POD_NUM=1

export GLOO_SOCKET_IFNAME=enp23s0f3
export TP_SOCKET_IFNAME=enp23s0f3
# export ASCEND_RT_VISIBLE_DEVICES=6
export VLLM_USE_V1=1
export VLLM_WORKER_MULTIPROC_METHOD=fork
# export VLLM_ENABLE_MC2=0
# export USING_LCCL_COM=0

# export FORWARD_TIME=30

export VLLM_LOGGING_LEVEL=INFO
export OMNI_USE_QWEN=1


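# Alternative launch kept for reference: the same single-node topology with a
# shorter 12288-token context window and a higher memory fraction (0.95 vs 0.9).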
# python /data/lumin/omni_infer/tools/scripts/start_api_servers.py \
#     --num-servers 1 \
#     --model-path /data/model/QwQ-32B \
#     --master-ip 7.150.13.139 \
#     --master-port 8503 \
#     --base-api-port 6660 \
#     --tp 8 \
#     --served-model-name qwen \
#     --max-model-len 12288 \
#     --log-dir ./apiserverlog \
#     --no-enable-prefix-caching \
#     --gpu-util 0.95 \
#     --extra-args "--max-num-batched-tokens 12288 --enforce-eager" \
#     --kv-transfer-config \
#      '{
#         "kv_connector": "AscendHcclConnectorV1",
#         "kv_buffer_device": "npu",
#         "kv_role": "kv_producer",
#         "kv_rank": 0,
#         "engine_id": 0,
#         "kv_parallel_size": 2
#     }' &

python /data/lumin/omni_infer/tools/scripts/start_api_servers.py \
    --num-servers 1 \
    --model-path /data/model/QwQ-32B \
    --master-ip 7.150.13.139 \
    --master-port 8503 \
    --base-api-port 6660 \
    --tp 8 \
    --served-model-name qwen \
    --max-model-len 40960 \
    --log-dir ./apiserverlog \
    --no-enable-prefix-caching \
    --no-enable-chunked-prefill \
    --gpu-util 0.9 \
    --extra-args "--max-num-batched-tokens 40960 --enforce-eager" \
    --kv-transfer-config \
     '{
        "kv_connector": "AscendHcclConnectorV1",
        "kv_buffer_device": "npu",
        "kv_role": "kv_producer",
        "kv_rank": 0,
        "engine_id": 0,
        "kv_parallel_size": 2
    }' &
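
# Decode-side counterpart (a sketch under assumptions, not verified against
# the actual decode script): the paired 1P1D decode launch would flip the
# role and the KV consumer settings, roughly:
# export ROLE=decode
# export RANK_TABLE_FILE_PATH=<the decode machine's local ranktable>  # hypothetical path
# ...and pass --kv-transfer-config with
#     "kv_role": "kv_consumer", "kv_rank": 1
# instead of the producer values above.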

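# Smoke test once the background servers are up (assumes start_api_servers.py
# fronts the standard vLLM OpenAI-compatible API on --base-api-port):
# curl -s http://127.0.0.1:6660/v1/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "qwen", "prompt": "Hello", "max_tokens": 16}'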